library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(leaps)
library(gridExtra)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13

Cleaning the Data

Let’s begin by reading the census data

# Load the full NYC census data set and fix the RNG seed so that all
# sampling done below is reproducible.
census_df_full <- read.csv("nyc_census.csv")
set.seed(1)
dim(census_df_full)  # rows x columns of the raw data
## [1] 2167   36

Our data contains 2167 entries and 36 variables (or features)

# List all 36 column names of the raw census data.
names(census_df_full)
##  [1] "CensusTract"     "County"          "Borough"        
##  [4] "TotalPop"        "Men"             "Women"          
##  [7] "Hispanic"        "White"           "Black"          
## [10] "Native"          "Asian"           "Citizen"        
## [13] "Income"          "IncomeErr"       "IncomePerCap"   
## [16] "IncomePerCapErr" "Poverty"         "ChildPoverty"   
## [19] "Professional"    "Service"         "Office"         
## [22] "Construction"    "Production"      "Drive"          
## [25] "Carpool"         "Transit"         "Walk"           
## [28] "OtherTransp"     "WorkAtHome"      "MeanCommute"    
## [31] "Employed"        "PrivateWork"     "PublicWork"     
## [34] "SelfEmployed"    "FamilyWork"      "Unemployment"

We will remove the columns that do not pertain to predicting median household income, and create two dataframes: one for Income and one for IncomePerCap.

# Build one dataframe per response variable, dropping the columns that do
# not pertain to that response (and 'Borough', which we exclude entirely).
drop_for_income <- c("Borough", "IncomeErr", "IncomePerCap", "IncomePerCapErr")
census_df_income <- census_df_full[, !(names(census_df_full) %in% drop_for_income)]

drop_for_ipc <- c("Borough", "Income", "IncomeErr", "IncomePerCapErr")
census_df_ipc <- census_df_full[, !(names(census_df_full) %in% drop_for_ipc)]

summary(census_df_income)
##   CensusTract             County       TotalPop          Men       
##  Min.   :3.601e+10   Bronx   :339   Min.   :    0   Min.   :    0  
##  1st Qu.:3.605e+10   Kings   :761   1st Qu.: 2360   1st Qu.: 1113  
##  Median :3.605e+10   New York:288   Median : 3550   Median : 1699  
##  Mean   :3.605e+10   Queens  :669   Mean   : 3889   Mean   : 1853  
##  3rd Qu.:3.608e+10   Richmond:110   3rd Qu.: 4958   3rd Qu.: 2360  
##  Max.   :3.609e+10                  Max.   :28926   Max.   :13460  
##                                                                    
##      Women          Hispanic          White            Black       
##  Min.   :    0   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 1224   1st Qu.:  9.00   1st Qu.:  4.10   1st Qu.:  1.50  
##  Median : 1848   Median : 18.40   Median : 22.95   Median :  7.80  
##  Mean   : 2036   Mean   : 26.62   Mean   : 33.06   Mean   : 23.95  
##  3rd Qu.: 2572   3rd Qu.: 39.90   3rd Qu.: 60.10   3rd Qu.: 39.08  
##  Max.   :15466   Max.   :100.00   Max.   :100.00   Max.   :100.00  
##                  NA's   :39       NA's   :39       NA's   :39      
##      Native            Asian          Citizen          Income      
##  Min.   : 0.0000   Min.   : 0.00   Min.   :    0   Min.   :  9829  
##  1st Qu.: 0.0000   1st Qu.: 2.10   1st Qu.: 1446   1st Qu.: 39073  
##  Median : 0.0000   Median : 6.70   Median : 2140   Median : 54505  
##  Mean   : 0.1986   Mean   :13.44   Mean   : 2436   Mean   : 59101  
##  3rd Qu.: 0.0000   3rd Qu.:18.93   3rd Qu.: 2976   3rd Qu.: 73272  
##  Max.   :11.3000   Max.   :89.80   Max.   :22905   Max.   :244375  
##  NA's   :39        NA's   :39                      NA's   :66      
##     Poverty        ChildPoverty    Professional       Service     
##  Min.   :  0.00   Min.   : 0.00   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.:  9.20   1st Qu.: 8.30   1st Qu.: 23.00   1st Qu.:15.70  
##  Median : 16.70   Median :21.10   Median : 33.60   Median :24.55  
##  Mean   : 19.57   Mean   :24.48   Mean   : 36.55   Mean   :24.23  
##  3rd Qu.: 26.70   3rd Qu.:37.55   3rd Qu.: 45.80   3rd Qu.:32.12  
##  Max.   :100.00   Max.   :95.70   Max.   :100.00   Max.   :59.80  
##  NA's   :42       NA's   :60      NA's   :43       NA's   :43     
##      Office        Construction      Production          Drive       
##  Min.   :  0.00   Min.   : 0.000   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.: 19.30   1st Qu.: 3.300   1st Qu.:  5.500   1st Qu.: 10.90  
##  Median : 23.10   Median : 6.100   Median :  8.950   Median : 20.75  
##  Mean   : 23.43   Mean   : 6.634   Mean   :  9.157   Mean   : 24.88  
##  3rd Qu.: 26.80   3rd Qu.: 9.200   3rd Qu.: 12.600   3rd Qu.: 36.33  
##  Max.   :100.00   Max.   :43.200   Max.   :100.000   Max.   :100.00  
##  NA's   :43       NA's   :43       NA's   :43        NA's   :43      
##     Carpool          Transit            Walk          OtherTransp    
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.: 2.000   1st Qu.: 43.30   1st Qu.:  3.300   1st Qu.: 0.500  
##  Median : 4.200   Median : 57.40   Median :  6.500   Median : 1.500  
##  Mean   : 5.088   Mean   : 54.93   Mean   :  9.048   Mean   : 2.307  
##  3rd Qu.: 7.200   3rd Qu.: 67.90   3rd Qu.: 10.800   3rd Qu.: 3.200  
##  Max.   :26.400   Max.   :100.00   Max.   :100.000   Max.   :55.600  
##  NA's   :43       NA's   :43       NA's   :43        NA's   :43      
##    WorkAtHome       MeanCommute       Employed      PrivateWork    
##  Min.   :  0.000   Min.   :15.20   Min.   :    0   Min.   : 38.60  
##  1st Qu.:  1.300   1st Qu.:37.20   1st Qu.: 1052   1st Qu.: 75.00  
##  Median :  2.800   Median :41.40   Median : 1579   Median : 79.90  
##  Mean   :  3.749   Mean   :40.83   Mean   : 1813   Mean   : 79.54  
##  3rd Qu.:  5.100   3rd Qu.:45.38   3rd Qu.: 2274   3rd Qu.: 84.40  
##  Max.   :100.000   Max.   :70.50   Max.   :12780   Max.   :100.00  
##  NA's   :43        NA's   :61                      NA's   :43      
##    PublicWork     SelfEmployed      FamilyWork      Unemployment    
##  Min.   : 0.00   Min.   : 0.000   Min.   :0.0000   Min.   :  0.000  
##  1st Qu.: 8.80   1st Qu.: 3.500   1st Qu.:0.0000   1st Qu.:  5.900  
##  Median :13.30   Median : 5.600   Median :0.0000   Median :  8.800  
##  Mean   :14.19   Mean   : 6.151   Mean   :0.1186   Mean   :  9.758  
##  3rd Qu.:18.70   3rd Qu.: 8.200   3rd Qu.:0.0000   3rd Qu.: 12.600  
##  Max.   :53.70   Max.   :61.400   Max.   :3.9000   Max.   :100.000  
##  NA's   :43      NA's   :43       NA's   :43       NA's   :42

As we can see in the summary above, our data still contains missing values. We will remove the rows where values are missing.

# Drop every row that contains at least one missing value.
census_df_naomit_income <- na.omit(census_df_income)
census_df_naomit_ipc    <- na.omit(census_df_ipc)

# Confirm how many complete rows remain in each dataframe.
dim(census_df_naomit_income)
## [1] 2095   32
dim(census_df_naomit_ipc)
## [1] 2100   32

We will also remove the column 'County' because, while it might be predictive, it will not contribute any interesting explanatory value. Additionally, we will remove 'White', 'Men', 'PrivateWork', 'Professional', and 'Transit' because they are redundant: each belongs to a group of percentage variables that sums to 100%, so it is fully determined by the other variables in its group.

# Drop 'County' (predictive but not explanatory) plus one column from each
# group of percentage variables that sums to 100% ('White', 'Men',
# 'PrivateWork', 'Professional', 'Transit') — each is fully determined by
# the other members of its group, so keeping it would be redundant.
redundant_cols <- c("County", "White", "Men",
                    "PrivateWork", "Professional", "Transit")

census_df_rd2_income <-
  census_df_naomit_income[, !(names(census_df_naomit_income) %in% redundant_cols)]
census_df_rd2_ipc <-
  census_df_naomit_ipc[, !(names(census_df_naomit_ipc) %in% redundant_cols)]

dim(census_df_rd2_income)
## [1] 2095   26
dim(census_df_rd2_ipc)
## [1] 2100   26

Exploratory Analysis

First, let's look at the correlations between our variables. (CensusTract is not considered, for the same reason as Borough.)

# Correlation heat maps (pairwise-complete observations) for each dataframe.
# Column 1 (CensusTract) is excluded: it is an identifier, not a predictor.
corrMat <- cor(census_df_rd2_income[, -1], use = "pairwise.complete.obs")
par(mfrow = c(1, 1))
corrplot(corrMat, method = "circle", title = "Median Income", mar = c(0, 0, 1, 0))

corrMat <- cor(census_df_rd2_ipc[, -1], use = "pairwise.complete.obs")
par(mfrow = c(1, 1))
corrplot(corrMat, method = "circle", title = "Income Per Capita", mar = c(0, 0, 1, 0))

TotalPop is highly correlated with Citizen (the number of citizens). We will remove it as redundant and show the remaining 24 variables for each dataframe.

# TotalPop correlates strongly with Citizen, so drop it as redundant; also
# set aside CensusTract (an identifier) before building predictor frames.
census_df_rd2_income <- census_df_rd2_income[, names(census_df_rd2_income) != "TotalPop"]
census_df_pred_income <- census_df_rd2_income[, names(census_df_rd2_income) != "CensusTract"]
dim(census_df_pred_income)
## [1] 2095   24
names(census_df_pred_income)
##  [1] "Women"        "Hispanic"     "Black"        "Native"
##  [5] "Asian"        "Citizen"      "Income"       "Poverty"
##  [9] "ChildPoverty" "Service"      "Office"       "Construction"
## [13] "Production"   "Drive"        "Carpool"      "Walk"
## [17] "OtherTransp"  "WorkAtHome"   "MeanCommute"  "Employed"
## [21] "PublicWork"   "SelfEmployed" "FamilyWork"   "Unemployment"
census_df_rd2_ipc <- census_df_rd2_ipc[, names(census_df_rd2_ipc) != "TotalPop"]
census_df_pred_ipc <- census_df_rd2_ipc[, names(census_df_rd2_ipc) != "CensusTract"]
dim(census_df_pred_ipc)
##  [1] "Women"        "Hispanic"     "Black"        "Native"      
##  [5] "Asian"        "Citizen"      "IncomePerCap" "Poverty"     
##  [9] "ChildPoverty" "Service"      "Office"       "Construction"
## [13] "Production"   "Drive"        "Carpool"      "Walk"        
## [17] "OtherTransp"  "WorkAtHome"   "MeanCommute"  "Employed"    
## [21] "PublicWork"   "SelfEmployed" "FamilyWork"   "Unemployment"

Now we will use Stepwise regressions to determine which features are the most predictive for each model. To determine how many features to include in each model, we will look at the number of features and some model evaluation metrics: RSS, Cp, Adjusted R^2, and BIC.

Income

# Best-subset selection for the Income model over all 23 predictors, then
# compare RSS, Cp, adjusted R^2 and BIC across subset sizes.
set.seed(1)  # reproducibility

regfit.full.income <- regsubsets(Income ~ ., data = census_df_pred_income, nvmax = 23)
reg.summary.income <- summary(regfit.full.income)

par(mfrow = c(2, 2))
plot(reg.summary.income$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary.income$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
plot(reg.summary.income$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
plot(reg.summary.income$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")

# Subset size with the lowest (best) BIC.
which.min(reg.summary.income$bic)
## [1] 14

As we can see, after 14 variables are included in our Income model, RSS and Cp do not decrease much further. Additionally, the Adjusted R^2 does not increase much beyond 14 variables, and BIC reaches its minimum (optimal) value at 14 variables.

IncomePerCap

# Best-subset selection for the IncomePerCap model over all 23 predictors,
# with the same four evaluation-metric plots as for Income.
set.seed(2)  # reproducibility

regfit.full.ipc <- regsubsets(IncomePerCap ~ ., data = census_df_pred_ipc, nvmax = 23)
reg.summary.ipc <- summary(regfit.full.ipc)

par(mfrow = c(2, 2))
plot(reg.summary.ipc$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary.ipc$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
plot(reg.summary.ipc$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
plot(reg.summary.ipc$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")

# Subset size with the lowest (best) BIC.
which.min(reg.summary.ipc$bic)
## [1] 13

By examining the plots, we can see that after about 13 features are implemented in our IncomePerCap model, the RSS and Cp do not decrease much further. Additionally, the Adjusted R^2 does not increase much beyond 13 features and the BIC finds its minimum at 13 variables implemented.

To gather further evidence for the most predictive features, we will use cross-validation to find which subsets give the smallest MSE on a test set.

Income

set.seed(1000)

# Predict method for regsubsets objects: rebuild the design matrix from the
# model formula stored in the fit's call, then multiply by the coefficients
# of the id-th best subset.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}

# 10-fold cross-validation of best-subset Income models of sizes 1..23.
# Rows of cv.mse.income are folds, columns are subset sizes.
folds_income <- sample(1:10, nrow(census_df_pred_income), replace = TRUE)
cv.mse.income <- matrix(NA, nrow = 10, ncol = 23)
for (i in 1:10) {
  best.sub <- regsubsets(Income ~ ., data = census_df_pred_income[folds_income != i, ], nvmax = 23)
  for (j in 1:23) {
    pred.s <- predict(best.sub, census_df_pred_income[folds_income == i, ], id = j)
    cv.mse.income[i, j] <- mean((census_df_pred_income$Income[folds_income == i] - pred.s)^2)
  }
}
avg.cv.mse.income <- apply(cv.mse.income, 2, mean)
plot(avg.cv.mse.income, type = "b", main = "MSE- 10 fold MSE, Best Subsets")
# FIX: 'v' was assigned but which.min() was redundantly recomputed inside
# abline(); reuse the stored value instead.
v <- which.min(avg.cv.mse.income)
abline(v = v, col = "blue")

As we can see above, the minimum MSE is found at 14 variables — the same optimal number of variables indicated by the BIC, RSS, Cp, and Adjusted R^2 plots.

IncomePerCap

set.seed(2000)
# NOTE: predict.regsubsets() is already defined earlier in this script; the
# original code redefined it here with an identical body, which was dead
# duplication and has been removed.

# 10-fold cross-validation of best-subset IncomePerCap models of sizes 1..23.
# Rows of cv.mse.ipc are folds, columns are subset sizes.
folds_ipc <- sample(1:10, nrow(census_df_pred_ipc), replace = TRUE)
cv.mse.ipc <- matrix(NA, nrow = 10, ncol = 23)
for (i in 1:10) {
  best.sub <- regsubsets(IncomePerCap ~ ., data = census_df_pred_ipc[folds_ipc != i, ], nvmax = 23)
  for (j in 1:23) {
    pred.s <- predict(best.sub, census_df_pred_ipc[folds_ipc == i, ], id = j)
    cv.mse.ipc[i, j] <- mean((census_df_pred_ipc$IncomePerCap[folds_ipc == i] - pred.s)^2)
  }
}
avg.cv.mse.ipc <- apply(cv.mse.ipc, 2, mean)
plot(avg.cv.mse.ipc, type = "b", main = "MSE- 10 fold MSE, Best Subsets")
# FIX: reuse the stored minimizer rather than recomputing which.min().
v <- which.min(avg.cv.mse.ipc)
abline(v = v, col = "blue")

IncomePerCap reaches its minimum cross-validated MSE at 20 variables. Although this is larger than the 13 variables we saw earlier, the plot shows that the decreases in MSE past 13 variables are negligible. Therefore, we will proceed with the 13 most predictive variables for our IncomePerCap model.

We can now examine the most predictive variables and their coefficients.

For our Income Model

# Refit best subsets on the full data and report the coefficients of the
# 14-variable Income model (the size chosen by BIC and cross-validation).
regfit.best.income <- regsubsets(Income ~ ., data = census_df_pred_income, nvmax = 23)
coef(regfit.best.income, id = 14)
##   (Intercept)      Hispanic         Black       Citizen       Poverty 
## 108185.896835     67.552656     62.630255     -3.973985   -986.731531 
##       Service        Office  Construction    Production         Drive 
##   -746.465538   -737.809956   -622.012848   -757.316100    365.923942 
##          Walk   OtherTransp    WorkAtHome      Employed    PublicWork 
##    310.108552   1223.835573    989.439797      4.693423   -256.564918

For our IncomePerCap Model

# Refit best subsets on the full data and report the coefficients of the
# 13-variable IncomePerCap model (the size chosen above).
regfit.best.ipc <- regsubsets(IncomePerCap ~ ., data = census_df_pred_ipc, nvmax = 23)
coef(regfit.best.ipc, id = 13)
##   (Intercept)         Asian       Poverty       Service        Office 
## 84514.0799342   -90.1087828  -615.9488241  -519.3038778  -544.3457615 
##  Construction    Production       Carpool          Walk   OtherTransp 
##  -727.3798415  -719.3725050  -235.4293857   401.2525869  1630.6422407 
##    WorkAtHome   MeanCommute      Employed    PublicWork 
##   536.7431216  -194.7644181     0.9142642  -295.6461710

Evaluating Models

We will start by creating a base model - that is a model with all variables included.

Income

# Baseline Income model: ordinary least squares on all 23 predictors.
model.base.income <- lm(Income ~ ., data = census_df_pred_income)
summary(model.base.income)
## 
## Call:
## lm(formula = Income ~ ., data = census_df_pred_income)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44174  -8750   -926   7472 113712 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.116e+05  4.170e+03  26.774  < 2e-16 ***
## Women        -8.714e-01  1.078e+00  -0.808 0.419017    
## Hispanic      5.071e+01  2.457e+01   2.064 0.039147 *  
## Black         5.114e+01  1.902e+01   2.688 0.007246 ** 
## Native        5.988e+01  4.715e+02   0.127 0.898961    
## Asian        -1.933e+01  2.716e+01  -0.712 0.476609    
## Citizen      -3.813e+00  8.950e-01  -4.261 2.13e-05 ***
## Poverty      -1.048e+03  7.493e+01 -13.982  < 2e-16 ***
## ChildPoverty  4.039e+01  4.299e+01   0.939 0.347596    
## Service      -7.302e+02  5.385e+01 -13.559  < 2e-16 ***
## Office       -7.383e+02  6.147e+01 -12.010  < 2e-16 ***
## Construction -6.147e+02  8.958e+01  -6.862 8.93e-12 ***
## Production   -7.122e+02  8.738e+01  -8.151 6.20e-16 ***
## Drive         3.637e+02  3.325e+01  10.936  < 2e-16 ***
## Carpool      -4.040e+01  9.677e+01  -0.417 0.676375    
## Walk          2.915e+02  5.319e+01   5.480 4.76e-08 ***
## OtherTransp   1.187e+03  1.511e+02   7.850 6.62e-15 ***
## WorkAtHome    9.876e+02  1.314e+02   7.515 8.40e-14 ***
## MeanCommute  -6.611e+01  7.324e+01  -0.903 0.366770    
## Employed      5.383e+00  1.001e+00   5.379 8.36e-08 ***
## PublicWork   -2.713e+02  7.021e+01  -3.864 0.000115 ***
## SelfEmployed -1.386e+02  1.050e+02  -1.321 0.186762    
## FamilyWork    1.495e+03  8.089e+02   1.849 0.064652 .  
## Unemployment  1.138e+02  8.304e+01   1.370 0.170829    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14370 on 2071 degrees of freedom
## Multiple R-squared:  0.754,  Adjusted R-squared:  0.7512 
## F-statistic: 275.9 on 23 and 2071 DF,  p-value: < 2.2e-16

IncomePerCap

# Baseline IncomePerCap model: ordinary least squares on all 23 predictors.
model.base.ipc <- lm(IncomePerCap ~ ., data = census_df_pred_ipc)
summary(model.base.ipc)
## 
## Call:
## lm(formula = IncomePerCap ~ ., data = census_df_pred_ipc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -48738  -6036   -415   5110 145925 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.449e+04  3.834e+03  22.036  < 2e-16 ***
## Women        -1.585e+00  9.921e-01  -1.598 0.110193    
## Hispanic     -1.104e+01  2.266e+01  -0.487 0.626307    
## Black         5.052e+00  1.752e+01   0.288 0.773111    
## Native       -1.211e+02  4.166e+02  -0.291 0.771239    
## Asian        -8.404e+01  2.502e+01  -3.359 0.000797 ***
## Citizen      -5.388e-02  8.235e-01  -0.065 0.947837    
## Poverty      -6.580e+02  6.819e+01  -9.649  < 2e-16 ***
## ChildPoverty  4.918e+01  3.889e+01   1.265 0.206192    
## Service      -5.341e+02  4.963e+01 -10.763  < 2e-16 ***
## Office       -5.695e+02  5.667e+01 -10.051  < 2e-16 ***
## Construction -7.221e+02  8.005e+01  -9.021  < 2e-16 ***
## Production   -7.013e+02  8.019e+01  -8.746  < 2e-16 ***
## Drive         3.056e+01  3.064e+01   0.997 0.318728    
## Carpool      -2.341e+02  8.908e+01  -2.628 0.008656 ** 
## Walk          4.153e+02  4.889e+01   8.495  < 2e-16 ***
## OtherTransp   1.645e+03  1.368e+02  12.024  < 2e-16 ***
## WorkAtHome    6.055e+02  1.169e+02   5.178 2.45e-07 ***
## MeanCommute  -2.141e+02  6.733e+01  -3.180 0.001494 ** 
## Employed      2.661e+00  9.216e-01   2.887 0.003928 ** 
## PublicWork   -3.371e+02  6.461e+01  -5.217 2.00e-07 ***
## SelfEmployed -1.917e+02  9.599e+01  -1.997 0.045940 *  
## FamilyWork    1.486e+03  7.456e+02   1.992 0.046460 *  
## Unemployment  1.949e+02  7.643e+01   2.550 0.010844 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13260 on 2076 degrees of freedom
## Multiple R-squared:  0.7286, Adjusted R-squared:  0.7256 
## F-statistic: 242.3 on 23 and 2076 DF,  p-value: < 2.2e-16

Next, we will create an Income model with the features we found most predictive: Hispanic, Black, Citizen, Poverty, Service, Office, Construction, Production, Drive, Walk, OtherTransp, WorkAtHome, Employed, and PublicWork.

# Reduced Income model: only the 14 predictors selected by best subsets.
model.income.1 <- lm(
  Income ~ Hispanic + Black + Citizen + Poverty + Service + Office +
    Construction + Production + Drive + Walk + OtherTransp + WorkAtHome +
    Employed + PublicWork,
  data = census_df_pred_income
)
summary(model.income.1)
## 
## Call:
## lm(formula = Income ~ Hispanic + Black + Citizen + Poverty + 
##     Service + Office + Construction + Production + Drive + Walk + 
##     OtherTransp + WorkAtHome + Employed + PublicWork, data = census_df_pred_income)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44504  -8797  -1025   7345 116360 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.082e+05  2.701e+03  40.059  < 2e-16 ***
## Hispanic      6.755e+01  2.103e+01   3.212 0.001339 ** 
## Black         6.263e+01  1.540e+01   4.067 4.94e-05 ***
## Citizen      -3.974e+00  6.533e-01  -6.083 1.40e-09 ***
## Poverty      -9.867e+02  4.155e+01 -23.750  < 2e-16 ***
## Service      -7.465e+02  4.968e+01 -15.025  < 2e-16 ***
## Office       -7.378e+02  5.978e+01 -12.342  < 2e-16 ***
## Construction -6.220e+02  8.804e+01  -7.065 2.18e-12 ***
## Production   -7.573e+02  8.346e+01  -9.073  < 2e-16 ***
## Drive         3.659e+02  3.155e+01  11.599  < 2e-16 ***
## Walk          3.101e+02  4.597e+01   6.746 1.96e-11 ***
## OtherTransp   1.224e+03  1.485e+02   8.242 2.95e-16 ***
## WorkAtHome    9.894e+02  1.256e+02   7.879 5.26e-15 ***
## Employed      4.693e+00  8.786e-01   5.342 1.02e-07 ***
## PublicWork   -2.566e+02  6.870e+01  -3.735 0.000193 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14370 on 2080 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7511 
## F-statistic: 452.4 on 14 and 2080 DF,  p-value: < 2.2e-16

Next, we will create an IncomePerCap model with the features we found most predictive: Asian, Poverty, Service, Office, Construction, Production, Carpool, Walk, OtherTransp, WorkAtHome, MeanCommute, Employed, and PublicWork.

# Reduced IncomePerCap model: only the 13 predictors selected by best subsets.
model.ipc.1 <- lm(
  IncomePerCap ~ Asian + Poverty + Service + Office + Construction +
    Production + Carpool + Walk + OtherTransp + WorkAtHome + MeanCommute +
    Employed + PublicWork,
  data = census_df_pred_ipc
)
summary(model.ipc.1)
## 
## Call:
## lm(formula = IncomePerCap ~ Asian + Poverty + Service + Office + 
##     Construction + Production + Carpool + Walk + OtherTransp + 
##     WorkAtHome + MeanCommute + Employed + PublicWork, data = census_df_pred_ipc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53757  -5985   -322   5061 146391 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  84514.0799  3335.2807  25.339  < 2e-16 ***
## Asian          -90.1088    20.6482  -4.364 1.34e-05 ***
## Poverty       -615.9488    33.8138 -18.216  < 2e-16 ***
## Service       -519.3039    41.4322 -12.534  < 2e-16 ***
## Office        -544.3458    53.7503 -10.127  < 2e-16 ***
## Construction  -727.3798    75.5406  -9.629  < 2e-16 ***
## Production    -719.3725    75.5659  -9.520  < 2e-16 ***
## Carpool       -235.4294    83.2682  -2.827 0.004738 ** 
## Walk           401.2526    46.0729   8.709  < 2e-16 ***
## OtherTransp   1630.6422   132.7297  12.285  < 2e-16 ***
## WorkAtHome     536.7431   112.9506   4.752 2.15e-06 ***
## MeanCommute   -194.7644    64.3192  -3.028 0.002491 ** 
## Employed         0.9143     0.2715   3.368 0.000772 ***
## PublicWork    -295.6462    54.0760  -5.467 5.12e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13290 on 2086 degrees of freedom
## Multiple R-squared:  0.7259, Adjusted R-squared:  0.7242 
## F-statistic:   425 on 13 and 2086 DF,  p-value: < 2.2e-16

Both of these optimized models have about the same Adjusted R^2 as their base models — 0.75 and 0.72, respectively. However, many of the variables in the base models have large p-values, while all of the variables in the reduced models have p-values well below 0.05.

We can also evaluate these models using cross-validation to see how each model performs on a test set. We will use MSE to evaluate their errors.

# 10-fold CV comparing the full Income model (column 1) against the
# 14-variable reduced model (column 2) by test-set MSE.
set.seed(1)
k <- 10
folds <- cut(seq_len(nrow(census_df_pred_income)), breaks = 10, labels = FALSE)
folds <- folds[sample(length(folds))]  # shuffle fold assignments
cv.errors.income <- matrix(NA, k, 2)

for (j in 1:k) {
  train <- census_df_pred_income[folds != j, ]
  test  <- census_df_pred_income[folds == j, ]
  lm.fit.base <- lm(Income ~ ., data = train)
  lm.fit.1 <- lm(Income ~ Hispanic + Black + Citizen + Poverty + Service +
                   Office + Construction + Production + Drive + Walk +
                   OtherTransp + WorkAtHome + Employed + PublicWork,
                 data = train)
  pred.base <- predict(lm.fit.base, test)
  pred.1 <- predict(lm.fit.1, test)
  actual <- census_df_pred_income$Income[folds == j]
  cv.errors.income[j, 1] <- mean((actual - pred.base)^2)
  cv.errors.income[j, 2] <- mean((actual - pred.1)^2)
}
print(cv.errors.income)
##            [,1]      [,2]
##  [1,] 247879680 246836976
##  [2,] 221620154 223088617
##  [3,] 160014769 158283690
##  [4,] 258397058 255427178
##  [5,] 198196387 198809931
##  [6,] 304679816 298684128
##  [7,] 186919020 186499589
##  [8,] 155710151 150963255
##  [9,] 192408682 188101206
## [10,] 190297725 187816911
# Average test MSE across the 10 folds: full model vs reduced model.
mean.cv.errors.income <- colMeans(cv.errors.income)
print(mean.cv.errors.income)
## [1] 211612344 209451148

As we can see, the Income model with the optimal number of variables has a lower test MSE than the full model. This makes sense, as the reduced model does not overfit the training data as much.

# 10-fold CV comparing the full IncomePerCap model (column 1) against the
# 13-variable reduced model (column 2) by test-set MSE.
set.seed(1)
k <- 10
folds <- cut(seq_len(nrow(census_df_pred_ipc)), breaks = 10, labels = FALSE)
folds <- folds[sample(length(folds))]  # shuffle fold assignments
cv.errors.ipc <- matrix(NA, k, 2)

for (j in 1:k) {
  lm.fit.base <- lm(IncomePerCap ~ ., data = census_df_pred_ipc[folds != j, ])
  lm.fit.2 <- lm(IncomePerCap ~ Asian + Poverty + Service + Office +
                   Construction + Production + Carpool + Walk + OtherTransp +
                   WorkAtHome + MeanCommute + Employed + PublicWork,
                 data = census_df_pred_ipc[folds != j, ])
  pred.base <- predict(lm.fit.base, census_df_pred_ipc[folds == j, ])
  pred.2 <- predict(lm.fit.2, census_df_pred_ipc[folds == j, ])
  # BUG FIX: the original used census_df_pred_ipc$Income, but this dataframe
  # has no 'Income' column — it only worked through R's '$' partial matching
  # to 'IncomePerCap'. Reference the response column explicitly so the
  # comparison cannot silently break if another 'Income*' column is added.
  actual <- census_df_pred_ipc$IncomePerCap[folds == j]
  cv.errors.ipc[j, 1] <- mean((actual - pred.base)^2)
  cv.errors.ipc[j, 2] <- mean((actual - pred.2)^2)
}
print(cv.errors.ipc)
print(cv.errors.ipc)
##            [,1]      [,2]
##  [1,] 146809657 147028309
##  [2,] 131829521 129256616
##  [3,] 173366261 174743128
##  [4,] 185522940 188463304
##  [5,] 178770820 181541819
##  [6,] 160169039 161235644
##  [7,] 238083423 234009814
##  [8,] 117567234 116031224
##  [9,] 217426196 212450407
## [10,] 265348193 260025982
# Average test MSE across the 10 folds: full model vs reduced model.
mean.cv.errors.ipc <- colMeans(cv.errors.ipc)
print(mean.cv.errors.ipc)
## [1] 181489329 180478625

Again, as expected, the optimized model for IncomePerCap has a smaller MSE value because it did not overfit the training set as much.

Perhaps we can improve upon these models. Let's look at plots of each variable vs. Income/IncomePerCap; it seems some variables might have skewed distributions.

Income

# Scatter plot of one predictor against Income with an OLS trend line.
plot_vs_income <- function(var) {
  ggplot(census_df_pred_income, aes(.data[[var]], Income)) +
    geom_point() +
    geom_smooth(method = "lm") +
    xlab(var)  # keep the original variable name as the x-axis label
}

# One panel per predictor retained in the reduced Income model,
# in the same order as the model specification.
income_vars <- c("Hispanic", "Black", "Citizen", "Poverty", "Service",
                 "Office", "Construction", "Production", "Drive", "Walk",
                 "OtherTransp", "WorkAtHome", "Employed", "PublicWork")
income_plots <- lapply(income_vars, plot_vs_income)

grid.arrange(grobs = income_plots)

IncomePerCap

# Scatter plot of one predictor against IncomePerCap with an OLS trend line.
plot_vs_ipc <- function(var) {
  ggplot(census_df_pred_ipc, aes(.data[[var]], IncomePerCap)) +
    geom_point() +
    geom_smooth(method = "lm") +
    xlab(var)  # keep the original variable name as the x-axis label
}

# One panel per predictor retained in the reduced IncomePerCap model,
# in the same order as the model specification.
ipc_vars <- c("Asian", "Poverty", "Service", "Office", "Construction",
              "Production", "Carpool", "Walk", "OtherTransp", "WorkAtHome",
              "MeanCommute", "Employed", "PublicWork")
ipc_plots <- lapply(ipc_vars, plot_vs_ipc)

grid.arrange(grobs = ipc_plots)

We can also look at the kernel density for each variable to get a better idea of the data’s distribution.

Income

# Kernel density estimate for each predictor kept in the reduced Income
# model, plotted one at a time in model order.
for (var in c("Hispanic", "Black", "Citizen", "Poverty", "Service",
              "Office", "Construction", "Production", "Drive", "Walk",
              "OtherTransp", "WorkAtHome", "Employed", "PublicWork")) {
  plot(density(census_df_pred_income[[var]]),
       xlab = var, ylab = "Density", main = "Kernel Density Plot")
}

IncomePerCap

# Kernel density estimate for each predictor kept in the IncomePerCap model.
ipc_density_vars <- c("Asian", "Poverty", "Service", "Office",
                      "Construction", "Production", "Carpool", "Walk",
                      "OtherTransp", "WorkAtHome", "MeanCommute",
                      "Employed", "PublicWork")
for (v in ipc_density_vars) {
  plot(density(census_df_pred_ipc[[v]]),
       xlab = v, ylab = 'Density', main = 'Kernel Density Plot')
}

After reviewing the distributions of the variables, it was clear that some appear to be exponentially distributed. Therefore, we will log-transform these variables to reduce their skew and make their relationship with the response more linear. Plots of these variables vs Income/IncomePerCap are shown below both before log-transformation and after.

Note: When we log-transform, we will add a constant to the data to avoid taking the log of 0.

Income

# Offset added before taking logs so tracts with a value of 0 stay finite.
constant <- 1
# Log10-transform the right-skewed Income predictors; each new column is
# named by prefixing the original name with "log" (e.g. logPoverty).
for (v in c("Poverty", "PublicWork", "Service",
            "Construction", "Production", "Walk")) {
  census_df_pred_income[[paste0("log", v)]] <-
    log10(census_df_pred_income[[v]] + constant)
}

# Side-by-side comparison for each skewed predictor vs Income:
# raw variable on the left panel, log-transformed on the right.
par(mfrow = c(1, 2))
for (v in c("Poverty", "PublicWork", "Service",
            "Construction", "Production", "Walk")) {
  plot(census_df_pred_income[[v]], census_df_pred_income$Income,
       xlab = v, ylab = 'Income')
  plot(census_df_pred_income[[paste0("log", v)]], census_df_pred_income$Income,
       xlab = paste0("log", v), ylab = 'Income')
}

IncomePerCap

# Offset added before taking logs so tracts with a value of 0 stay finite.
constant <- 1
# Log10-transform the right-skewed IncomePerCap predictors; each new column
# is named by prefixing the original name with "log" (e.g. logPoverty).
for (v in c("Poverty", "Service", "Construction",
            "Production", "MeanCommute", "PublicWork")) {
  census_df_pred_ipc[[paste0("log", v)]] <-
    log10(census_df_pred_ipc[[v]] + constant)
}


# Side-by-side comparison for each skewed predictor vs IncomePerCap:
# raw variable on the left panel, log-transformed on the right.
# Fixes from the original: the y-axis label read "Income Per Capital"
# (should be "Income Per Capita"), and the log labels mixed casing
# ("Log(Poverty)" vs "log(Service)") — now uniformly "log(Var)".
par(mfrow = c(1, 2))
for (v in c("Poverty", "Service", "Construction",
            "Production", "MeanCommute", "PublicWork")) {
  plot(census_df_pred_ipc[[v]], census_df_pred_ipc$IncomePerCap,
       xlab = v, ylab = 'Income Per Capita')
  plot(census_df_pred_ipc[[paste0("log", v)]], census_df_pred_ipc$IncomePerCap,
       xlab = paste0("log(", v, ")"), ylab = 'Income Per Capita')
}

Now we can compare our previously optimized model with 14 variables: Hispanic, Black, Citizen, Poverty, Service, Office, Construction, Production, Drive, Walk, OtherTransp, WorkAtHome, Employed, and PublicWork, to the optimized model with logPoverty, logPublicWork, logService, logConstruction, logProduction, and logWalk as log-transformed variables. We will evaluate the models using their 10-fold cross-validated MSE on held-out folds.

set.seed(3)  # reproducibility of the fold assignment
k <- 10
# Assign each row to one of k folds, then shuffle so fold membership is
# random with respect to row order. (Original hard-coded breaks = 10;
# use k so the fold count is defined in exactly one place.)
folds <- cut(seq_len(nrow(census_df_pred_income)), breaks = k, labels = FALSE)
folds <- folds[sample(length(folds))]
# Column 1: raw-variable model; column 2: log-transformed model.
cv.errors.income <- matrix(NA, k, 2)

for (j in seq_len(k)) {
  # Model 1: the 14 raw predictors selected earlier.
  lm.fit.income.1 <- lm(Income ~ Hispanic + Black + Citizen + Poverty + Service + Office + Construction + Production + Drive + Walk + OtherTransp + WorkAtHome + Employed + PublicWork, data = census_df_pred_income[folds != j, ])
  # Model 2: same predictors, with the skewed ones log-transformed.
  lm.fit.income.2 <- lm(Income ~ Hispanic + Black + Citizen + logPoverty + logService + Office + logConstruction + logProduction + Drive + logWalk + OtherTransp + WorkAtHome + Employed + logPublicWork, data = census_df_pred_income[folds != j, ])
  pred.1 <- predict(lm.fit.income.1, census_df_pred_income[folds == j, ])
  pred.2 <- predict(lm.fit.income.2, census_df_pred_income[folds == j, ])
  # Held-out MSE on fold j for each model.
  cv.errors.income[j, 1] <- mean((census_df_pred_income$Income[folds == j] - pred.1)^2)
  cv.errors.income[j, 2] <- mean((census_df_pred_income$Income[folds == j] - pred.2)^2)
}
print(cv.errors.income)
##            [,1]      [,2]
##  [1,] 165803463 141292976
##  [2,] 370150361 286346429
##  [3,] 186249509 149243263
##  [4,] 221779556 162935341
##  [5,] 147475885 106894802
##  [6,] 239646057 180608307
##  [7,] 218514324 176781391
##  [8,] 179127636 135046604
##  [9,] 160662798 126969856
## [10,] 221055001 189606958
# Average CV error per model across the k folds.
mean.cv.errors.income <- apply(cv.errors.income, 2, mean)
print(mean.cv.errors.income)
## [1] 211046459 165572593

Clearly, the MSE of the model with log-transformed variables is much smaller than the MSE of the model with just our 14 variables.

set.seed(4)  # reproducibility of the fold assignment
k <- 10
# Assign each row to one of k folds, then shuffle. (Original hard-coded
# breaks = 10; use k so the fold count is defined in exactly one place.)
folds <- cut(seq_len(nrow(census_df_pred_ipc)), breaks = k, labels = FALSE)
folds <- folds[sample(length(folds))]
# Column 1: raw-variable model; column 2: log-transformed model.
cv.errors.ipc <- matrix(NA, k, 2)

for (j in seq_len(k)) {
  # Model 3: the 13 raw predictors selected earlier.
  lm.fit.3 <- lm(IncomePerCap ~ Asian + Poverty + Service + Office + Construction + Production + Carpool + Walk + OtherTransp + WorkAtHome + MeanCommute + Employed + PublicWork, data = census_df_pred_ipc[folds != j, ])
  # Model 4: same predictors, with the skewed ones log-transformed.
  lm.fit.4 <- lm(IncomePerCap ~ Asian + logPoverty + logService + Office + logConstruction + logProduction + Carpool + Walk + OtherTransp + WorkAtHome + logMeanCommute + Employed + logPublicWork, data = census_df_pred_ipc[folds != j, ])
  pred.3 <- predict(lm.fit.3, census_df_pred_ipc[folds == j, ])
  pred.4 <- predict(lm.fit.4, census_df_pred_ipc[folds == j, ])
  # Held-out MSE on fold j for each model.
  cv.errors.ipc[j, 1] <- mean((census_df_pred_ipc$IncomePerCap[folds == j] - pred.3)^2)
  cv.errors.ipc[j, 2] <- mean((census_df_pred_ipc$IncomePerCap[folds == j] - pred.4)^2)
}
print(cv.errors.ipc)
##            [,1]      [,2]
##  [1,] 167619474 117572774
##  [2,] 138375702 103284079
##  [3,] 143333240 106582157
##  [4,] 285164816 215894718
##  [5,] 124510063  95241703
##  [6,] 279426027 190512281
##  [7,] 188766048 133443393
##  [8,] 157029219 133632771
##  [9,] 141613222  94261977
## [10,] 172742487 133530140
# Average CV error per model across the k folds.
mean.cv.errors.ipc <- apply(cv.errors.ipc, 2, mean)
print(mean.cv.errors.ipc)
## [1] 179858030 132395599

Once again, we can see by examining the cross-validated MSE of both models, the model with log-transformed variables improves the model.

We can now use our optimized subset variables and use Ridge/Lasso Regression instead of normal linear regression. We will use cross-validation to select the optimal lambda values and to calculate the MSE on a test set.

# Design matrix for the Income model using the log-transformed predictors.
xi <- model.matrix(Income ~ Hispanic + Black + Citizen + logPoverty + logService + Office + logConstruction + logProduction + Drive + logWalk + OtherTransp + WorkAtHome + Employed + logPublicWork, census_df_pred_income)
xi <- xi[, -c(1)]  # drop the intercept column; glmnet supplies its own
yi <- census_df_pred_income$Income

set.seed(1)
# 50/50 train/test split by row index.
traini <- sample(1:nrow(xi), nrow(xi) / 2)
testi <- (-traini)
yi.test <- yi[testi]

# Lambda grid spanning 10^10 down to 10^-2.
grid <- 10^seq(10, -2, length = 100)

# Ridge regression (alpha = 0) over the full lambda grid.
ridge.mod.income <- glmnet(xi[traini, ], yi[traini], alpha = 0, lambda = grid, thresh = 1e-12)

set.seed(100)
# 10-fold cross-validation on the training half to select lambda.
cv.out.income <- cv.glmnet(xi[traini, ], yi[traini], alpha = 0, nfolds = 10)
plot(cv.out.income)

bestlam_income <- cv.out.income$lambda.min  # lambda minimizing CV MSE
ridge.pred.income <- predict(ridge.mod.income, s = bestlam_income, newx = xi[testi, ])
mean((ridge.pred.income - yi.test)^2)  # test-set MSE at the selected lambda
## [1] 157422938
# Refit on all rows and report the ridge coefficients at the best lambda.
out_income <- glmnet(xi, yi, alpha = 0)
predict(out_income, type = "coefficients", s = bestlam_income)
## 15 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)     189224.711976
## Hispanic           -16.471431
## Black                8.966549
## Citizen             -2.259999
## logPoverty      -38538.923702
## logService      -34621.859573
## Office            -573.333977
## logConstruction  -7435.272119
## logProduction   -14263.721486
## Drive              256.210173
## logWalk          -2939.692642
## OtherTransp        905.883220
## WorkAtHome         562.032707
## Employed             2.677705
## logPublicWork    -9676.224297

As we can see, the ridge regression predicting Income has a smaller MSE than the previous linear model. The coefficients show that Poverty and Service are very predictive for Income (inversely so).

# Design matrix for the IncomePerCap model using log-transformed predictors.
xipc <- model.matrix(IncomePerCap ~ Asian + logPoverty + logService + Office + logConstruction + logProduction + Carpool + Walk + OtherTransp + WorkAtHome + logMeanCommute + Employed + logPublicWork, census_df_pred_ipc)
xipc <- xipc[, -c(1)]  # drop the intercept column; glmnet supplies its own
yipc <- census_df_pred_ipc$IncomePerCap

set.seed(1)
# 50/50 train/test split by row index.
train_ipc <- sample(1:nrow(xipc), nrow(xipc) / 2)
test_ipc <- (-train_ipc)
yipc.test <- yipc[test_ipc]

# Lambda grid spanning 10^10 down to 10^-2.
grid <- 10^seq(10, -2, length = 100)

# Ridge regression (alpha = 0) over the full lambda grid.
ridge.mod.ipc <- glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 0, lambda = grid, thresh = 1e-12)

set.seed(100)
# 10-fold cross-validation on the training half to select lambda.
cv.out.ipc <- cv.glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 0, nfolds = 10)
plot(cv.out.ipc)

bestlam_ipc <- cv.out.ipc$lambda.min  # lambda minimizing CV MSE
ridge.pred.ipc <- predict(ridge.mod.ipc, s = bestlam_ipc, newx = xipc[test_ipc, ])
mean((ridge.pred.ipc - yipc.test)^2)  # test-set MSE at the selected lambda
## [1] 121827455
# Refit on all rows and report the ridge coefficients at the best lambda.
out_ipc <- glmnet(xipc, yipc, alpha = 0)
predict(out_ipc, type = "coefficients", s = bestlam_ipc)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)     162431.693148
## Asian              -29.311726
## logPoverty      -20855.543176
## logService      -29418.039357
## Office            -414.307741
## logConstruction -10685.698784
## logProduction   -16595.696995
## Carpool           -170.236564
## Walk               116.594876
## OtherTransp       1197.254708
## WorkAtHome         222.038050
## logMeanCommute  -16422.478997
## Employed             0.997845
## logPublicWork    -9175.909717

Once again, the Ridge Regression does in fact have a lower MSE! Poverty and Service are again very predictive, but in this model, MeanCommute and Production are also very predictive.

Let’s test Lasso (again with optimized lambda values)!

dim(xi)
## [1] 2095   14
# BUG FIX: the original fit this "lasso" model with alpha = 0, which is
# ridge regression in glmnet (alpha = 1 is the lasso). The test MSE and
# coefficient printout below were therefore produced by a ridge fit,
# which is why no coefficient is exactly zero.
# NOTE(review): the '##' output shown below predates this fix and will
# change on re-knit (a true lasso typically zeroes some coefficients).
lasso.mod.income <- glmnet(xi[traini, ], yi[traini], alpha = 1, lambda = grid)
plot(lasso.mod.income)

set.seed(1)
# Cross-validate on the training half to select the lasso lambda.
cv.out.income <- cv.glmnet(xi[traini, ], yi[traini], alpha = 1)
plot(cv.out.income)

bestlam_income <- cv.out.income$lambda.min  # lambda minimizing CV MSE
lasso.pred.income <- predict(lasso.mod.income, s = bestlam_income, newx = xi[testi, ])
mean((lasso.pred.income - yi.test)^2)  # test-set MSE at the selected lambda
## [1] 154777322
# Refit on all rows and extract the lasso coefficients at the best lambda.
out_income <- glmnet(xi, yi, alpha = 1, lambda = grid)
lasso.coef.income <- predict(out_income, type = "coefficients", s = bestlam_income)
lasso.coef.income
## 15 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)     193118.301907
## Hispanic            25.359147
## Black               41.357423
## Citizen             -4.605524
## logPoverty      -40067.734160
## logService      -39065.421622
## Office            -583.330870
## logConstruction  -8205.594509
## logProduction   -14124.876147
## Drive              317.777860
## logWalk          -1610.850456
## OtherTransp        885.548935
## WorkAtHome         553.225286
## Employed             5.674311
## logPublicWork    -9279.266987

Lasso Regression returns about the same test MSE as Ridge Regression for this model. However, the lasso regression does not have to utilize all of the variables given. The only variables with substantial weight associated in the lasso regression are: Poverty, Service, Production, OtherTransp, and WorkAtHome. This is consistent with what we found in the ridge regression - some variables are far more predictive than others. Ultimately, either the Ridge or Lasso Regression produces a great model for predicting Income over the optimized, transformed subset of variables.

dim(xipc)
## [1] 2100   13
# BUG FIX: the original fit this "lasso" model with alpha = 0, which is
# ridge regression in glmnet (alpha = 1 is the lasso), so the printed
# test MSE and coefficients below came from a ridge fit.
# NOTE(review): the '##' output shown below predates this fix and will
# change on re-knit (a true lasso typically zeroes some coefficients).
lasso.mod.ipc <- glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 1, lambda = grid)
plot(lasso.mod.ipc)

set.seed(1)
# Cross-validate on the training half to select the lasso lambda.
cv.out.ipc <- cv.glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 1)
plot(cv.out.ipc)

bestlam_ipc <- cv.out.ipc$lambda.min  # lambda minimizing CV MSE
lasso.pred.ipc <- predict(lasso.mod.ipc, s = bestlam_ipc, newx = xipc[test_ipc, ])
mean((lasso.pred.ipc - yipc.test)^2)  # test-set MSE at the selected lambda
## [1] 120754522
# Refit on all rows and extract the lasso coefficients at the best lambda.
out_ipc <- glmnet(xipc, yipc, alpha = 1, lambda = grid)
lasso.coef.ipc <- predict(out_ipc, type = "coefficients", s = bestlam_ipc)
lasso.coef.ipc
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      1.589693e+05
## Asian           -3.184934e+01
## logPoverty      -2.090588e+04
## logService      -3.289470e+04
## Office          -4.392563e+02
## logConstruction -1.088677e+04
## logProduction   -1.675182e+04
## Carpool         -1.602653e+02
## Walk             1.143845e+02
## OtherTransp      1.236576e+03
## WorkAtHome       1.050498e+02
## logMeanCommute  -9.882854e+03
## Employed         8.895556e-01
## logPublicWork   -1.001348e+04

In the IncomePerCap model, we can see that lasso regression does improve the MSE over Ridge Regression. Thus, the lasso regression with the optimal, transformed subset of variables is best for predicting IncomePerCap. It is also interesting that none of the coefficients in the lasso regression of this model are 0. This is a strong indication that all of these variables contain predictive value for IncomePerCap.